0.1 Discriminant Factors

# Dependencies: fpc for pamk(), cluster for pam()/silhouette(), ape for phylo plots
library(fpc) # pamk
library(cluster) # pam
library(ape)

# Load the clustering feature matrix; key each row by "<collection>_<model>"
df <- read.csv("../data/clustering_features.csv.gz")
id.vector <- paste(df$collection, df$model, sep = "_")
rownames(df) <- id.vector
# Numeric-only view: drop the first two identifier columns (collection, model)
df.num <- subset(df, select = -c(1:2))
# One display color per model collection
colsCollection <- c("#A6A9AA", "#000000", "#3E7CBC", "#A3D2E2", "#7E8082", "#EDA85F", "#CD2028") #labels=c('agora','bigg','ebrahim','embl','path','seed','uminho')
#############################################
### plot.rf.var.importance.by.class.heatmap
#############################################
# Heatmap of per-class variable importance from a random forest.
# Args:
#   model: fitted randomForest object
#   predVar: name to give the predictor/variable axis column
#   classVar: name to give the class axis column
#   title: plot header
plot.rf.var.importance.by.class.heatmap <- function(model, predVar, classVar, title) {
  imp <- importance(model)
  # Long format: one row per (predictor, class) importance value
  long <- melt(imp[, seq_len(length(model$classes))])
  colnames(long) <- c(predVar, classVar, "testImportance")
  # Order predictors by their global MeanDecreaseAccuracy (ascending)
  ordered.preds <- names(sort(imp[, "MeanDecreaseAccuracy"]))
  long[, predVar] <- factor(long[, predVar], levels = ordered.preds)
  ggplot(data = long, aes_string(x = classVar, y = predVar, fill = "testImportance")) +
    geom_tile() +
    scale_fill_gradient2() +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    ggtitle(title)
}

#############################################
### plot.rf.var.importance.by.class.dotplot
#############################################
# Faceted dotplot of per-class variable importance from a random forest.
# Args:
#   model: fitted randomForest object
#   predVar: name to give the predictor/variable axis column
#   classVar: name to give the class axis column
#   title: plot header (NOTE: kept for interface compatibility; not used here)
plot.rf.var.importance.by.class.dotplot <- function(model, predVar, classVar, title) {
  imp <- importance(model)
  # Long format: one row per (predictor, class) importance value
  long <- melt(imp[, seq_len(length(model$classes))])
  colnames(long) <- c(predVar, classVar, "value")
  # Order predictors by their global MeanDecreaseAccuracy (ascending)
  ordered.preds <- names(sort(imp[, "MeanDecreaseAccuracy"]))
  long[, predVar] <- factor(long[, predVar], levels = ordered.preds)
  long[, classVar] <- factor(long[, classVar])
  # Lollipop chart: grey stems to zero, one facet per class,
  # colored with the global collection palette
  ggplot(long, aes_string(x = "value", y = predVar, group = predVar, colour = classVar)) +
    geom_segment(aes_string(yend = predVar), xend = 0, colour = "grey50") +
    geom_point(size = 1) +
    scale_color_manual(values = colsCollection) +
    theme_bw() +
    facet_grid(reformulate(classVar)) +
    theme(panel.grid.major.y = element_blank()) +
    theme(text = element_text(size = 12)) +
    xlab(paste(predVar, " importance (Mean Decrease in Accuracy in RandomForest)", sep = "")) +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    theme(legend.position = "none")
}

####################################################
### plot.rf.var.importance.by.class.andMean.heatmap
###################################################
# Heatmap of per-class variable importance plus the across-class mean column.
# Args:
#   model: fitted randomForest object
#   predVar: name to give the predictor/variable axis column
#   classVar: name to give the class axis column
#   title: plot header
plot.rf.var.importance.by.class.andMean.heatmap <- function(model, predVar, classVar, title) {
  imp <- importance(model)
  # Per-class columns plus the trailing MeanDecreaseAccuracy column
  n.cols <- length(model$classes) + 1
  long <- melt(imp[, seq_len(n.cols)])
  colnames(long) <- c(predVar, classVar, "testImportance")
  # Order predictors by their global MeanDecreaseAccuracy (ascending)
  ordered.preds <- names(sort(imp[, "MeanDecreaseAccuracy"]))
  long[, predVar] <- factor(long[, predVar], levels = ordered.preds)
  # Relabel the last level (MeanDecreaseAccuracy) as "MEAN"
  old.levels <- levels(long[, classVar])
  levels(long[, classVar]) <- c(old.levels[seq_len(length(old.levels) - 1)], "MEAN")
  long[, classVar] <- factor(long[, classVar])
  ggplot(data = long, aes_string(x = classVar, y = predVar, fill = "testImportance")) +
    geom_tile() +
    scale_fill_gradient2() +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    ggtitle(title)
}


####################################################
### plot.rf.var.importance.by.class.andMean.dotplot
####################################################
# Faceted dotplot of per-class variable importance plus the across-class mean.
# Args:
#   model: fitted randomForest object
#   predVar: name to give the predictor/variable axis column
#   classVar: name to give the class axis column
#   colorVector: vector of colors (one per class plus one for MEAN); optional
#   nBestFeatures: if given, show only the top-n most important features
#   classNames: optional ad-hoc class display names (MEAN is appended)
plot.rf.var.importance.by.class.andMean.dotplot <- function(model,predVar,classVar,colorVector=NULL,nBestFeatures=NULL,classNames=NULL){
  # Per-class importance columns plus the trailing MeanDecreaseAccuracy column
  imp.df=melt(importance(model)[,seq_len(length(model$classes)+1)])
  colnames(imp.df)=c(predVar,classVar,'value')
  # a.-Order rows by global MeanDecreaseAccuracy (ascending)
  pred.order=names(sort(importance(model)[,'MeanDecreaseAccuracy']))
  imp.df[,predVar] <- factor(imp.df[,predVar], levels = pred.order)
  # Relabel classes; the last level (MeanDecreaseAccuracy) becomes "MEAN"
  class.names=levels(imp.df[,classVar])
  if(!is.null(classNames)){
    levels(imp.df[,classVar]) <- c(classNames,'MEAN')
  }else{
    levels(imp.df[,classVar]) <- c(class.names[seq_len(length(class.names)-1)],'MEAN')
  }
  imp.df[,classVar] <- factor(imp.df[,classVar])
  # b.- Optionally keep only the top-n features.
  # BUG FIX: the original used subset(imp.df, subset = (test %in% ...)),
  # which hard-coded a column literally named 'test' and therefore only
  # worked when predVar == 'test'. Filter on the predVar column instead.
  if(!is.null(nBestFeatures)){
    imp.df <- imp.df[imp.df[[predVar]] %in% tail(pred.order, n = nBestFeatures), ]
  }
  p <- ggplot(imp.df, aes_string(x = 'value', y = predVar, group = predVar, colour = classVar)) +
    geom_segment(aes_string(yend=predVar), xend=0, colour="grey50") +
    geom_point( size = 3) +
    theme_bw() +
    facet_grid(reformulate(classVar)) +
    theme(panel.grid.major.y = element_blank()) +
    theme(text = element_text(size=16)) +
    xlab(paste(predVar," importance (Mean Decrease in Accuracy)",sep='')) +
    theme(axis.text.x = element_text(angle = 270, hjust = 1)) +
    theme(legend.position="none")
  if(!is.null(colorVector)){
    p +  scale_color_manual(values=colorVector)
  }else{
    p
  }
}
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
# Random forest classifying models by collection, evaluated with 10-fold CV.
# Drop column 2 ('model' id); 'collection' remains as the class variable.
df.rf = subset(df, select = -c(2))
set.seed(123)
train_control <-
  trainControl(method = "cv",
               number = 10,
               savePredictions = "all")
rf_file <- "../data/rf_model_classCollection.Rdata"
# Train once and cache to disk; later runs reload the cached model.
if (!file.exists(rf_file)) {
  model <-
    train(
      form = collection ~ .,
      data = df.rf,
      trControl = train_control,
      method = "rf",
      ntree = 1000,
      importance = TRUE,
      localImp = TRUE,
      na.action = na.omit
    )
  # Use rf_file here — the original duplicated the hard-coded path,
  # which could silently diverge from the path checked above.
  save(model, file = rf_file)
} else {
  load(rf_file)
}
# Final model summary: OOB error rate and per-class confusion matrix.
# The pasted output below shows near-perfect separation by collection.
print(model$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, importance = TRUE,      localImp = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 65
## 
##         OOB estimate of  error rate: 0.02%
## Confusion matrix:
##         agora bigg carveme ebrahim kbase optflux path class.error
## agora     801    0       0       0     0       0    0  0.00000000
## bigg        0   36       0       0     0       0    0  0.00000000
## carveme     0    0    5511       0     0       0    0  0.00000000
## ebrahim     0    0       1      79     0       0    0  0.01250000
## kbase       0    0       0       0  1632       0    0  0.00000000
## optflux     0    0       1       0     0      78    0  0.01265823
## path        0    0       0       0     0       0 2641  0.00000000
# Variable Importance (type=1: Mean Decrease in Accuracy)
varImpPlot(model$finalModel,type=1)

# Confusion matrix over the cross-validation hold-out predictions
# (model$pred is populated because savePredictions = "all" above)
print(confusionMatrix(model$pred$pred,model$pred$obs))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction agora  bigg carveme ebrahim kbase optflux  path
##    agora    2403     0       0       1     0       0     0
##    bigg        0   108       0       0     0       0     0
##    carveme     0     0   16530      81     0      80    10
##    ebrahim     0     0       3     158     0       0     0
##    kbase       0     0       0       0  4896       0     0
##    optflux     0     0       0       0     0     157     0
##    path        0     0       0       0     0       0  7913
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9946          
##                  95% CI : (0.9937, 0.9954)
##     No Information Rate : 0.5112          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9916          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: agora Class: bigg Class: carveme
## Sensitivity               1.00000     1.00000         0.9998
## Specificity               0.99997     1.00000         0.9892
## Pos Pred Value            0.99958     1.00000         0.9898
## Neg Pred Value            1.00000     1.00000         0.9998
## Prevalence                0.07430     0.00334         0.5112
## Detection Rate            0.07430     0.00334         0.5111
## Detection Prevalence      0.07434     0.00334         0.5164
## Balanced Accuracy         0.99998     1.00000         0.9945
##                      Class: ebrahim Class: kbase Class: optflux
## Sensitivity                0.658333       1.0000       0.662447
## Specificity                0.999907       1.0000       1.000000
## Pos Pred Value             0.981366       1.0000       1.000000
## Neg Pred Value             0.997452       1.0000       0.997514
## Prevalence                 0.007421       0.1514       0.007328
## Detection Rate             0.004886       0.1514       0.004855
## Detection Prevalence       0.004978       0.1514       0.004855
## Balanced Accuracy          0.829120       1.0000       0.831224
##                      Class: path
## Sensitivity               0.9987
## Specificity               1.0000
## Pos Pred Value            1.0000
## Neg Pred Value            0.9996
## Prevalence                0.2450
## Detection Rate            0.2447
## Detection Prevalence      0.2447
## Balanced Accuracy         0.9994
# Work with the underlying randomForest object from here on.
model=model$finalModel
# NOTE(review): randomForest objects store OOB predictions in $predicted,
# not $pred — 'pred' is likely NULL here; confirm intent.
pred=model$pred
# Change names classes (MeanDecreaseAccuracy --> Mean)
# NOTE(review): model$classes is typically a character vector, so levels()
# returns NULL and levels<-() may not relabel as intended — verify the
# plot legends actually show the new names.
class.names=levels(model$classes)
levels(model$classes) <-c('AGORA','CarveMe','Path2Models','KBase','BiGG','Ebrahim et al.','OptFlux')
model$classes <- factor(model$classes)
library(reshape2) # melt, used by the plotting helpers
# Spell the argument name out in full — the original 'colorVec=' relied on
# R's partial argument matching against 'colorVector'.
plot.rf.var.importance.by.class.andMean.dotplot(model,'test','collection',colorVector=c(colsCollection,'#60d660'))

# (typo fixed in the title string: "Decreasy" -> "Decrease")
plot.rf.var.importance.by.class.andMean.heatmap(model,'test','collection','Feature importance (Mean Decrease in Accuracy in Random Forest)')

plot.rf.var.importance.by.class.andMean.dotplot(model,'test','collection',colorVector=c(colsCollection,'#60d660'),nBestFeatures=15,classNames=c('AGORA','CarveMe','Path2Models','KBase','BiGG','Ebrahim et al.','OptFlux'))

0.2 Clustering

# PAM clustering: let pamk() pick the best k in 2:25 (by average silhouette),
# cache the fit, and export the cluster membership as a TSV.
fitPamBest <- pamk(df.num,krange=2:25)
save(fitPamBest,file='../data/fitPamBest_k2-25.Rdata')
# paste0 instead of paste(..., sep="") for the output filename
write.table(as.matrix(fitPamBest$pamobject$clustering),paste0("../data/pam_clusters_k",fitPamBest$nc,".txt"),quote=FALSE,sep='\t',col.names=NA,row.names=TRUE)
# Average silhouette width per k (k = 1 first; k = 2 is the maximum):
 #[1] 0.0000000 0.6994101 0.5969360 0.6976904 0.5068908 0.4770732 0.4372348
 #[8] 0.4513308 0.4319778 0.4467728 0.4305388 0.3842136 0.3688111 0.3674104
#[15] 0.3331504 0.3097679 0.3130108 0.3412444 0.3377852 0.3110066 0.3153869
#[22] 0.3001647 0.2838108 0.2866504 0.2949700
# PAM with k = 2 (the k selected by pamk above): silhouette summary
fit <- pam(df.num,2)
print(summary(silhouette(fit)))
## Silhouette of 10780 units in 2 clusters from pam(x = df.num, k = 2) :
##  Cluster sizes and average silhouette widths:
##      9112      1668 
## 0.7384107 0.8227154 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.04206 0.70796 0.80226 0.75146 0.80990 0.88255
# PAM with k = 4 for comparison (higher mean silhouette than k = 2)
fit <- pam(df.num,4)
print(summary(silhouette(fit)))
## Silhouette of 10780 units in 4 clusters from pam(x = df.num, k = 4) :
##  Cluster sizes and average silhouette widths:
##       803      1666      5587      2724 
## 0.7479980 0.8196234 0.8587196 0.7966747 
## Individual silhouette widths:
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.008044  0.816291  0.856216  0.828752  0.882189  0.903315
# Hierarchical clustering on the Euclidean distance matrix
# (hclust default linkage), then sweep k in 2:25 and keep the k
# maximizing the average silhouette width.
distMat <-dist(df.num)
fitH <- hclust(distMat)
SIbest=0 # best average silhouette width found so far
kbest=0  # k achieving SIbest
for(k in 2:25){
  si=summary(silhouette(cutree(fitH,k=k),distMat))$avg.width
  if(si>SIbest){
    SIbest=si
    kbest=k
  }
  print(paste(k,si,sep=':')) # trace "k:avg.width" for every candidate k
}
## [1] "2:0.751455283570779"
## [1] "3:0.74847998211438"
## [1] "4:0.622720089625695"
## [1] "5:0.683254757159377"
## [1] "6:0.682465212661397"
## [1] "7:0.680032957416679"
## [1] "8:0.679604274071885"
## [1] "9:0.664318000860433"
## [1] "10:0.813294970511758"
## [1] "11:0.820027772126945"
## [1] "12:0.822651166977458"
## [1] "13:0.822427652636859"
## [1] "14:0.822642117309186"
## [1] "15:0.823881931204672"
## [1] "16:0.823959863839715"
## [1] "17:0.824214716658879"
## [1] "18:0.821680179851846"
## [1] "19:0.821810108215891"
## [1] "20:0.821771637422601"
## [1] "21:0.822016412264233"
## [1] "22:0.821903229749889"
## [1] "23:0.821952190236938"
## [1] "24:0.821693270568127"
## [1] "25:0.821722309859649"
# Silhouette object for the winning k (kbest = 17 per the trace above)
si<-silhouette(cutree(fitH,k=kbest),distMat)
summary(si)
## Silhouette of 10780 units in 17 clusters from silhouette.default(x = cutree(fitH, k = kbest), dist = distMat) :
##  Cluster sizes and average silhouette widths:
##        801          2         17          3          5          7 
## 0.75249347 0.32024415 0.43875582 0.48093039 0.53719431 0.31436406 
##          2       5532         59       1631          1         31 
## 0.01565035 0.87380033 0.28647517 0.71927797 0.00000000 0.37667441 
##         20         15       2646          1          7 
## 0.29207112 0.33543354 0.83892294 0.00000000 0.22567109 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.2233  0.8038  0.8625  0.8242  0.8904  0.9120
# Keep a copy with the full "<collection>_<model>" labels for the export below
fitH.labelModels=fitH
# Shorten tip labels to the collection prefix (text before the first '_')
fitH$labels=gsub('_.*','',fitH$labels)
library(RColorBrewer)
# kbest (= 17) exceeds the 12-color maximum of "Paired" — hence the warning;
# colorRampPalette below interpolates the capped palette up to kbest colors.
my.palette <- brewer.pal(kbest,"Paired")
## Warning in brewer.pal(kbest, "Paired"): n too large, allowed maximum for palette Paired is 12
## Returning the palette you asked for with that many colors
cols <- colorRampPalette(my.palette)(kbest)
clusK=cutree(fitH,kbest) # cluster id per model, in data order
# Fan and unrooted dendrogram views, tips colored by cluster
plot(as.phylo(fitH), type = "fan", cex = 0.6, label.offset = 0.3, no.margin=TRUE, tip.color = cols[clusK])

plot(as.phylo(fitH), type='unrooted', cex=0.5, label.offset=0.5, no.margin=TRUE, tip.color = cols[clusK])

# Export cluster membership (with full model labels) as a TSV
groups <- as.factor(cutree(fitH.labelModels, k = kbest))
write.table(
  as.matrix(groups),
  paste("../data/hclust_clusters_k", kbest, ".txt", sep = ""),
  quote = FALSE,
  sep = '\t',
  col.names = NA,
  row.names = TRUE
)
library(RColorBrewer)
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.10.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following objects are masked from 'package:ape':
## 
##     ladderize, rotate
## The following object is masked from 'package:stats':
## 
##     cutree
# One color per cluster for the dendrogram branches
colsCluster=colorspace::rainbow_hcl(kbest, c = 70, l  = 50)

# Cluster ids in dendrogram (leaf) order rather than data order
clusK=cutree(fitH,kbest,order_clusters_as_data = FALSE)

# define dendrogram
fitH.dend=as.dendrogram(fitH)
collec=labels(fitH.dend) # leaf labels (collection prefixes) in plotting order

# Specify different point types and colors for each leave
# NOTE(review): branches are colored with k = 10 while kbest = 17 colors were
# generated — only the first 10 are used (see warning below); confirm whether
# k = kbest was intended.
dend <- fitH.dend %>% 
  set("leaves_pch", 19) %>%  # node point type
  set("leaves_cex", 0.4) %>%  # node point size
  #set("leaves_col", colsCollection[as.factor(fitH$labels)]) %>% #node point color
  set("labels", "") %>%  # blank out all labels ("" is recycled; see warning)
  set("branches_k_color", colsCluster, k = 10)
## Warning in `labels<-.dendrogram`(dend, value = value, ...): The lengths
## of the new labels is shorter than the number of leaves in the dendrogram -
## labels are recycled.
## Warning in get_col(col, k): Length of color vector was longer than the
## number of clusters - first k elements are used
plot(dend)

# Add the colored bar
# Create a vector giving a color for each model collection
# Inspired by: https://cran.r-project.org/web/packages/dendextend/vignettes/FAQ.html
collect_type <- rep("Other", length(rownames(df.num)))
is_x <- grepl("agora", rownames(df.num))
collect_type[is_x] <- "agora"
is_x <- grepl("bigg", rownames(df.num))
collect_type[is_x] <- "bigg"
is_x <- grepl("ebrahim", rownames(df.num))
collect_type[is_x] <- "ebrahim"
is_x <- grepl("embl", rownames(df.num))
collect_type[is_x] <- "embl"
is_x <- grepl("path", rownames(df.num))
collect_type[is_x] <- "path"
is_x <- grepl("seed", rownames(df.num))
collect_type[is_x] <- "seed"
is_x <- grepl("uminho", rownames(df.num))
collect_type[is_x] <- "uminho"
collect_type <- factor(collect_type)
n_collect_types <- length(unique(collect_type))
# Map each collection (factor level index) to its display color
col_collect_type <- colsCollection[collect_type]

colored_bars(col_collect_type, dend, rowLabels = "Collection")

library(ggplot2)
ggd1 <- as.ggdend(dend)
# Create a radial plot and remove labels
ggplot(ggd1, labels = FALSE) +
  scale_y_reverse(expand = c(0.2, 0)) +
  coord_polar(theta = "x")
## Warning: Removed 10779 rows containing missing values (geom_point).

0.2.0.1 Discriminant factor of clusters

# Random forest predicting the hierarchical cluster (k = kbest) from the
# numeric features — which features discriminate the clusters?
df.rf = df.num
df.rf$cluster = as.factor(cutree(fitH, kbest))
set.seed(123)
rf_file = "../data/rf_model_classCluster.Rdata"
# Train once and cache to disk; later runs reload the cached model.
# Reuses 'train_control' (10-fold CV, savePredictions = "all") from above.
if (!file.exists(rf_file)) {
  model.cl <-
    train(
      form = cluster ~ .,
      data = df.rf,
      trControl = train_control,
      method = "rf",
      ntree = 1000,
      importance = TRUE,
      localImp = TRUE,
      na.action = na.omit,
      do.trace = 10 # print OOB progress every 10 trees
    )
  save(model.cl, file = rf_file)
} else {
  load(rf_file)
}
# Final model summary: OOB error rate and per-cluster confusion matrix.
# Small clusters (2, 7, 11, 16) are never predicted correctly (class.error 1).
print(model.cl$finalModel)
## 
## Call:
##  randomForest(x = x, y = y, ntree = 1000, mtry = param$mtry, importance = TRUE,      localImp = TRUE, do.trace = 10) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 65
## 
##         OOB estimate of  error rate: 0.12%
## Confusion matrix:
##      1 2  3 4 5 6 7    8  9   10 11 12 13 14   15 16 17  class.error
## 1  801 0  0 0 0 0 0    0  0    0  0  0  0  0    0  0  0 0.0000000000
## 2    0 0  0 0 0 1 0    0  0    0  0  0  0  0    1  0  0 1.0000000000
## 3    0 0 17 0 0 0 0    0  0    0  0  0  0  0    0  0  0 0.0000000000
## 4    0 0  0 3 0 0 0    0  0    0  0  0  0  0    0  0  0 0.0000000000
## 5    0 0  1 0 4 0 0    0  0    0  0  0  0  0    0  0  0 0.2000000000
## 6    0 1  0 0 0 6 0    0  0    0  0  0  0  0    0  0  0 0.1428571429
## 7    0 0  2 0 0 0 0    0  0    0  0  0  0  0    0  0  0 1.0000000000
## 8    0 0  0 0 0 0 0 5531  1    0  0  0  0  0    0  0  0 0.0001807664
## 9    0 0  0 0 0 0 0    0 59    0  0  0  0  0    0  0  0 0.0000000000
## 10   0 0  0 0 0 0 0    0  0 1631  0  0  0  0    0  0  0 0.0000000000
## 11   0 0  0 0 0 0 0    0  0    1  0  0  0  0    0  0  0 1.0000000000
## 12   0 0  0 0 0 0 0    0  0    0  0 30  0  1    0  0  0 0.0322580645
## 13   0 0  0 0 0 0 0    0  0    0  0  0 20  0    0  0  0 0.0000000000
## 14   0 0  0 0 0 0 0    0  0    0  0  0  0 14    1  0  0 0.0666666667
## 15   0 0  0 0 0 0 0    0  0    0  0  0  0  0 2646  0  0 0.0000000000
## 16   0 0  0 0 0 0 0    0  1    0  0  0  0  0    0  0  0 1.0000000000
## 17   0 0  0 0 0 0 0    0  0    0  0  1  0  0    1  0  5 0.2857142857
# Variable Importance (type=1: Mean Decrease in Accuracy)
varImpPlot(model.cl$finalModel,type=1)

# Confusion matrix over the cross-validation hold-out predictions
print(confusionMatrix(model.cl$pred$pred,model.cl$pred$obs))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     1     2     3     4     5     6     7     8     9    10
##         1   1923     0     0     0     0     0     0     0     0     0
##         2      0     0     0     0     0     2     0     0     0     0
##         3      0     2    39     3     2     6     3     0     0     0
##         4      0     0     0     3     0     0     0     0     0     0
##         5      0     0     0     0    10     0     0     0     0     0
##         6      0     2     0     0     0     7     0     0     0     0
##         7      0     0     0     0     0     0     0     0     0     0
##         8      0     1     0     0     0     0     0 13273    47     0
##         9      0     0     0     0     0     0     0     2    94     0
##         10     0     0     0     0     0     0     0     0     0  3915
##         11     0     0     0     0     0     0     0     0     0     0
##         12     0     0     0     0     0     0     0     0     0     0
##         13     0     0     0     0     0     0     0     0     0     0
##         14     0     0     0     0     0     0     0     0     0     0
##         15     0     1     0     0     0     0     0     0     0     0
##         16     0     0     0     0     0     0     0     0     0     0
##         17     0     0     0     0     0     0     0     0     0     0
##           Reference
## Prediction    11    12    13    14    15    16    17
##         1      0     0     0     0     0     0     0
##         2      0     0     0     0     0     0     0
##         3      0     0     0     0     0     0     0
##         4      0     0     0     0     0     0     0
##         5      0     0     0     0     0     0     0
##         6      0     0     0     0     0     0     0
##         7      0     0     0     0     0     0     0
##         8      0    25    16    13     9     0     6
##         9      0     0     0     0     0     0     0
##         10     0     0     0     0     0     0     0
##         11     0     0     0     0     0     0     0
##         12     0    49     0     1     0     0     0
##         13     0     0    32     0     0     0     0
##         14     0     0     0    25     0     0     0
##         15     0     1     0     0  6342     0     0
##         16     0     0     0     0     0     0     0
##         17     0     0     0     0     0     0    12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9945          
##                  95% CI : (0.9935, 0.9954)
##     No Information Rate : 0.5132          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9915          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1  Class: 2 Class: 3 Class: 4  Class: 5
## Sensitivity           1.00000 0.000e+00 1.000000 0.500000 0.8333333
## Specificity           1.00000 9.999e-01 0.999380 1.000000 1.0000000
## Pos Pred Value        1.00000 0.000e+00 0.709091 1.000000 1.0000000
## Neg Pred Value        1.00000 9.998e-01 1.000000 0.999884 0.9999226
## Prevalence            0.07434 2.320e-04 0.001508 0.000232 0.0004639
## Detection Rate        0.07434 0.000e+00 0.001508 0.000116 0.0003866
## Detection Prevalence  0.07434 7.732e-05 0.002126 0.000116 0.0003866
## Balanced Accuracy     1.00000 5.000e-01 0.999690 0.750000 0.9166667
##                       Class: 6 Class: 7 Class: 8 Class: 9 Class: 10
## Sensitivity          0.4666667 0.000000   0.9998 0.666667    1.0000
## Specificity          0.9999226 1.000000   0.9907 0.999922    1.0000
## Pos Pred Value       0.7777778      NaN   0.9913 0.979167    1.0000
## Neg Pred Value       0.9996906 0.999884   0.9998 0.998176    1.0000
## Prevalence           0.0005799 0.000116   0.5132 0.005451    0.1514
## Detection Rate       0.0002706 0.000000   0.5131 0.003634    0.1514
## Detection Prevalence 0.0003479 0.000000   0.5177 0.003711    0.1514
## Balanced Accuracy    0.7332947 0.500000   0.9953 0.833294    1.0000
##                      Class: 11 Class: 12 Class: 13 Class: 14 Class: 15
## Sensitivity                 NA  0.653333  0.666667 0.6410256    0.9986
## Specificity                  1  0.999961  1.000000 1.0000000    0.9999
## Pos Pred Value              NA  0.980000  1.000000 1.0000000    0.9997
## Neg Pred Value              NA  0.998993  0.999381 0.9994582    0.9995
## Prevalence                   0  0.002900  0.001856 0.0015078    0.2455
## Detection Rate               0  0.001894  0.001237 0.0009665    0.2452
## Detection Prevalence         0  0.001933  0.001237 0.0009665    0.2453
## Balanced Accuracy           NA  0.826647  0.833333 0.8205128    0.9992
##                      Class: 16 Class: 17
## Sensitivity                 NA 0.6666667
## Specificity                  1 1.0000000
## Pos Pred Value              NA 1.0000000
## Neg Pred Value              NA 0.9997679
## Prevalence                   0 0.0006959
## Detection Rate               0 0.0004639
## Detection Prevalence         0 0.0004639
## Balanced Accuracy           NA 0.8333333
# Work with the underlying randomForest object; keep the CV hold-out
# predictions from the caret wrapper (available via savePredictions = "all").
model=model.cl$finalModel
pred=model.cl$pred
library(reshape2) # melt, used by the plotting helpers
# 17 cluster colors plus one for the MEAN column (18 total).
# NOTE(review): the first and last entries are identical (#DB9D85), so the
# MEAN facet shares cluster 1's color — confirm this is intended.
colVector=c(
  "#DB9D85",
"#E2979B",
"#E494B2",
"#DF94C6",
"#D297D5",
"#BD9EDF",
"#A2A7E2",
"#80B0DE",
"#5CB7D3",
"#3EBCC3",
"#3ABEAF",
"#52BE99",
"#70BB84",
"#8DB771",
"#A7B166",
"#BCAB66",
"#CEA472",
"#DB9D85")
# agora, bigg, bigg, bigg, ebrahim+path+uminho, ~embl, embl+path, path, seed, seed
plot.rf.var.importance.by.class.andMean.dotplot(model,'test','cluster',colorVector=colVector)

# (typo fixed in the title string: "Decreasy" -> "Decrease")
plot.rf.var.importance.by.class.andMean.heatmap(model,'test','cluster','Feature importance (Mean Decrease in Accuracy in Random Forest)')